#!/usr/bin/env python
# -*- encoding: utf-8 -*-
'''
@Time    :   2021/08/14 14:40:54
@Author  :   Leo Wood 
@Contact :   leowood@foxmail.com
'''

from selenium import webdriver
from bs4 import BeautifulSoup
import os
import random
import time
from tqdm import tqdm

def get_driver():
    """Start a Chrome browser session driven by a local chromedriver binary.

    The chromedriver location is hard-coded below and is also exported
    through the ``webdriver.chrome.driver`` environment variable.

    Returns:
        A Selenium Chrome WebDriver instance. The caller owns the session
        and should call ``quit()`` on it when finished.
    """
    # Local import keeps this block self-contained; it is part of the
    # selenium package the module already depends on.
    from selenium.webdriver.chrome.service import Service

    chromedriver = r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe'
    os.environ["webdriver.chrome.driver"] = chromedriver

    # chromedriver.exe drives Chrome, so the Chrome driver class must be used
    # here; the IE driver class is meant for a different driver executable.
    try:
        # Newer Selenium releases take the driver path via a Service object.
        driver = webdriver.Chrome(service=Service(chromedriver))
    except TypeError:
        # Older Selenium releases do not know the ``service`` keyword and
        # take the executable path as the first positional argument instead.
        driver = webdriver.Chrome(chromedriver)
    return driver


def get_html(word_list_path):
    """Download the kmcha.com "similar words" page for each listed word.

    Reads ``word_list.txt`` (UTF-8, one word per line) from
    ``word_list_path``. Every entry made up solely of Chinese characters is
    fetched through the browser, and the prettified page source is saved as
    ``<word>.html`` inside the ``html`` sub-directory of ``word_list_path``.
    Entries that are empty or contain other characters are skipped.

    Args:
        word_list_path: Directory that contains ``word_list.txt`` and that
            receives the ``html`` output directory.
    """
    list_file = os.path.join(word_list_path, 'word_list.txt')
    with open(list_file, 'r', encoding='utf-8') as f:
        word_list = [line.strip() for line in f]

    # Make sure the output directory exists so the first write cannot fail
    # with FileNotFoundError.
    html_dir = os.path.join(word_list_path, 'html')
    os.makedirs(html_dir, exist_ok=True)

    driver = get_driver()
    try:
        for word in tqdm(word_list):
            if not all_chinese(word):
                continue

            url = "https://kmcha.com/similar/{}".format(word)
            driver.get(url)
            html = BeautifulSoup(driver.page_source, "html.parser")

            out_file = os.path.join(html_dir, word + ".html")
            with open(out_file, 'w', encoding='utf-8') as out:
                out.write(html.prettify())

            # Random pause between requests to avoid hammering the site.
            time.sleep(random.randint(2, 4))
    finally:
        # Always close the browser, even when a page load or write fails;
        # otherwise the browser and driver processes are left running.
        driver.quit()


def get_content_from_html(path):
    """Parse a saved HTML file and print its prettified markup.

    Args:
        path: Path of the HTML file to read.
    """
    # The pages are written as UTF-8 by get_html, so read them back with the
    # same encoding instead of the platform default, which may differ.
    with open(path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f.read(), "html.parser")
    print(soup.prettify())


def all_chinese(text):
    """Tell whether ``text`` is non-empty and made up only of Chinese characters.

    A character counts as Chinese when it lies in the inclusive range
    U+4E00..U+9FA5.

    Args:
        text: The string to inspect; a falsy value (``None``, ``""``) is
            never considered Chinese.

    Returns:
        ``True`` if ``text`` is truthy and every character is in the range,
        otherwise ``False``.
    """
    if not text:
        return False
    return all('\u4e00' <= ch <= '\u9fa5' for ch in text)




if __name__ == '__main__':
    # Earlier experiments, kept for reference (note that get_html now takes
    # a directory path, not a list of words):
    # get_html(["肝结核","肺结核"])
    # get_content_from_html("肺结核_.html")

    # Crawl every word listed in Miss/word_list.txt; the downloaded pages
    # are saved under Miss/html/.
    get_html("Miss")
